The benchmarks in the present analysis rely on our own previous analysis and annotation of papers, as well as on open resources such as Papers With Code, including data from several repositories (e.g., EFF, NLP-progress, SQuAD, RedditSota, etc.).

We focus on “interest” rather than “progress” for AI benchmarks as this is something we can compute using some proxies.

For this analysis we use the normalised hits obtained on AItopics per benchmark over the last decade (2008-2019). Note that the results from 2019 are incomplete.

# Prepare the benchmark-interest data for visualisation.
#
# data      : data.frame with per-year hit columns named "X<year>", a
#             "TaskHierarchies" column ("Area > Subtask > ..."), a "keyword"
#             column and one column per cognitive-ability code (see cogAbs).
# rangeMean : the mean is taken over the last (rangeMean + 1) years of `years`.
# norm      : if TRUE, scale mean interest to [0, 1] before weighting.
# years     : the year range covered by the "X<year>" columns.
#
# Returns a list:
#   [[1]] ability matrix weighted by (possibly normalised) mean interest
#   [[2]] raw ability matrix
#   [[3]] mean interest per benchmark
#   [[4]] normalised mean interest (identical to [[3]] when norm = FALSE)
prepareVis <- function(data, rangeMean = 5, norm = TRUE, years = 2008:2019) {

  set.seed(288)
  interest.df <- data

  # Rename the per-year columns "X<year>" to plain year labels.
  ini <- which(colnames(interest.df) == paste0("X", years[1]))
  fin <- which(colnames(interest.df) == paste0("X", years[length(years)]))
  # BUGFIX: was hard-coded as.character(2008:2019), ignoring the `years`
  # argument for any non-default range.
  colnames(interest.df)[ini:fin] <- as.character(years)

  # Keep only the last (rangeMean + 1) years. seq_len() replaces 1:n, which
  # produced c(1, 0) when rangeMean == length(years) - 1 and silently dropped
  # the first year from the sum while still dividing by (rangeMean + 1).
  r <- seq_len(length(years) - rangeMean - 1)
  years.range <- if (length(r) > 0) years[-r] else years
  interest.df <- colwise(type.convert)(interest.df)
  interest.df$mean.Interest <- rowSums(select(interest.df, as.character(years.range))) / (rangeMean + 1)

  # Top-level area = first element of "A > B > ..." in TaskHierarchies.
  # as.character() guards against type.convert having produced a factor.
  interest.df$category <- vapply(
    as.character(interest.df$TaskHierarchies),
    function(th) str_trim(str_split(th, pattern = ">")[[1]][1]),
    character(1),
    USE.NAMES = FALSE
  ) # unique(interest.df$category)

  # Cognitive-ability column codes expected in `data`.
  cogAbs <- c("MP", "SI", "VP", "AP", "AS", "PA", "CE", "CO", "EC", "NV", "CL", "QL", "MS", "MC")

  interest <- select(interest.df, one_of(c("keyword", "category", cogAbs, "mean.Interest")))
  keywords <- interest$keyword
  categories <- interest$category
  rownames(interest) <- keywords

  # Drop the keyword/category columns, keeping abilities + mean.Interest.
  interest <- interest[, -(1:2)]
  interest["ILSVRC", "QL"] <- 0 # <------------------- check it!
  interest <- colwise(type.convert)(interest)
  rownames(interest) <- keywords
  # Avoid exact zeros so downstream log/normalise steps stay finite.
  interest[interest$mean.Interest == 0, "mean.Interest"] <- 0.0000000000001

  interest.mean <- interest$mean.Interest

  if (norm) {
    interest <- select(interest, -mean.Interest)
    # normalize() presumably from BBmisc — TODO confirm the "scale" method
    # actually honours the range argument.
    interest.mean.norm <- normalize(interest.mean + 0.000001, method = "scale", range = c(0, 1))
    interest.pond <- interest * interest.mean.norm
    return(list(interest.pond, interest, interest.mean, interest.mean.norm))
  } else {
    interest <- select(interest, -mean.Interest)
    interest.pond <- interest * interest.mean
    return(list(interest.pond, interest, interest.mean, interest.mean))
  }
}
# Render the benchmark x cognitive-ability matrix as an interactive
# bipartite visNetwork graph.
#
# data       : numeric matrix/data.frame — benchmarks in rows, the 14
#              cognitive-ability codes in columns, weights = interest.
# categories : one area label per benchmark row (paperswithcode groups).
# norm       : kept for interface compatibility; currently unused.
plotVis <- function(data, categories, norm = TRUE) {
  set.seed(288)

  # One colour per area. BUGFIX: first entry was "blalck", which is not a
  # valid R colour name; corrected to "black".
  colours <- c("1" = "black", "2" = "#543005", "3" = "#8c510a", "4" = "#bf812d",
               "5" = "#dfc27d", "6" = "#f6e8c3", "7" = "#f5f5f5", "8" = "#c7eae5",
               "9" = "#80cdc1", "10" = "#35978f", "11" = "#01665e", "12" = "#003c30",
               "13" = "#FAFAFA")

  # Bipartite graph: benchmark rows vs. the 14 cognitive-ability columns.
  vis <- toVisNetworkData(graph_from_incidence_matrix(data, directed = FALSE, weighted = TRUE))

  # Node size: fixed for benchmarks, proportional to total weight for the
  # 14 ability nodes (which come last in the node list).
  vis$nodes$value <- c(rep(10, nrow(vis$nodes) - 14), colSums(data) * 10000)
  vis$nodes$title <- vis$nodes$label
  vis$nodes$category <- c(categories, rep("CogAb", 14))
  vis$nodes$group <- vis$nodes$category
  vis$nodes$color <- colours[as.numeric(as.factor(vis$nodes$category))]

  # Edge width: interest, range-normalised then log-compressed; the small
  # offsets keep zero weights finite under log().
  vis$edges$value <- log(normalize(vis$edges$weight + 0.00001, method = "range", range = c(0, 1)) + 0.00001)

  v <- visNetwork(vis$nodes, vis$edges, height = "1000px", width = "100%") %>%
    visEdges(arrows = "to", color = list(color = 'rgba(70,130,180,0.3)', highlight = "#4682B4")) %>%
    visIgraphLayout(
      physics = FALSE,
      randomSeed = 2017,
      layout = "layout_with_fr"
    ) %>%
    visInteraction(navigationButtons = TRUE) %>%
    visOptions(selectedBy = "group", highlightNearest = TRUE)

  return(v)
}

Mean Interest

# Load the processed interest data (one row per benchmark, "X<year>" columns).
interest.df <- read.xlsx2("interest_kw_processed_raw_slope.xlsx", sheetIndex = 1)

# Mean interest over three windows. Each prepareVis() result is computed once
# (the original called prepareVis() twice with identical arguments for the
# decade window). NOTE(review): `years` is assumed to be defined earlier in
# the document (presumably 2008:2019) — confirm.
vis.decade  <- prepareVis(interest.df, length(years) - 1, norm = FALSE)
vis.lustrum <- prepareVis(interest.df, 5, norm = FALSE)
vis.year    <- prepareVis(interest.df, 1, norm = FALSE)

df.interest <- data.frame(Benchmark = rownames(vis.decade[[1]]),
                          Last.Decade = vis.decade[[3]],
                          Last.Lustrum = vis.lustrum[[3]],
                          Last.Year = vis.year[[3]])

# Long format: one row per (benchmark, window) pair.
df.interest.m <- melt(df.interest, id.vars = "Benchmark")

# Dot plots of mean interest (raw and log scale), benchmarks sorted by value.
a <- ggplot(df.interest.m, aes(reorder(Benchmark, value), value, colour = variable)) +
  geom_point(alpha = 1/3, size = 3.5) + xlab("Mean Interest") + ylab("") +
  coord_flip() + theme_minimal() + theme(legend.position = "bottom")

b <- ggplot(df.interest.m, aes(reorder(Benchmark, value), log(value), colour = variable)) +
  geom_point(alpha = 1/3, size = 3.5) + xlab("log(Mean Interest)") + ylab("") +
  coord_flip() + theme_minimal() + theme(legend.position = "bottom")

a
b

Mapping between AI benchmarks and Cognitive Abilities

Graphical representation

  • Benchmarks are grouped and coloured by area (groups from https://paperswithcode.com/)
  • Cognitive abilities are coloured in black and their size represents their relevance (total sum in the mapping) weighted by interest (previous plot).
  • Edges represent that an ability is assigned to a task.
  • The width of the edges represents “interest” in the benchmark: the wider the edge, the more interest from the community during the last decade (mean).

Note that we can perform exactly the same analysis focusing on different (ranges of) years and obtaining the same graph but the width of the edges may vary (a little bit).

(Networks are interactive!)

Last decade (2008-2019)

# Top-level area per benchmark: first element of "A > B > ..." in
# TaskHierarchies. vapply() replaces the 1:nrow() loop (safe on zero rows);
# as.character() guards against TaskHierarchies being a factor.
interest.df$category <- vapply(
  as.character(interest.df$TaskHierarchies),
  function(th) str_trim(str_split(th, pattern = ">")[[1]][1]),
  character(1),
  USE.NAMES = FALSE
)

categories <- interest.df$category

# Ability matrix weighted by mean interest over the whole decade.
# NOTE(review): `years` is assumed to be defined earlier (presumably
# 2008:2019) — confirm.
interest.pond.All <- prepareVis(interest.df, length(years) - 1, norm = FALSE)
plotVis(interest.pond.All[[1]], categories)

Last lustrum (2014-2019)

Almost unnoticeable differences regarding the width of the edges (due to the size of the graph and the small variations in mean interest).

# Same weighted ability network, restricted to the last 6 years
# (rangeMean = 5, i.e. 2014-2019). interest.pond.5 is reused by the
# barplots further below.
interest.pond.5 <- prepareVis(interest.df, 5, norm = F)
plotVis(interest.pond.5[[1]], categories)

Relevance of the cognitive abilities

  • Left: total sum
  • Right: total sum weighted by mean interest over the last decade (normalised average number of documents).

Last decade (2008-2019)

# Relevance of each cognitive ability (last decade). prepareVis() returns
# [[1]] = interest-weighted matrix and [[2]] = raw matrix; BUGFIX: the
# indices were reversed relative to the plot titles (and to the trailing
# comments), so each plot showed the other quantity.
barplot(colSums(interest.pond.All[[2]]), main = "Total sum") # interest.sumcols
barplot(colSums(interest.pond.All[[1]]), main = "Total sum (pondered by mean interest)") # interest.pondered

Last lustrum (2014-2019)

# Relevance of each cognitive ability (last lustrum). prepareVis() returns
# [[1]] = interest-weighted matrix and [[2]] = raw matrix; BUGFIX: the
# indices were reversed relative to the plot titles (and to the trailing
# comments), so each plot showed the other quantity.
barplot(colSums(interest.pond.5[[2]]), main = "Total sum") # interest.sumcols
barplot(colSums(interest.pond.5[[1]]), main = "Total sum (pondered by mean interest)") # interest.pondered

Interest per benchmark.

(Groups from https://paperswithcode.com/)

Computer Vision

# Per-benchmark interest for the "Computer Vision" group.
# NOTE(review): plotIterest.Cat() and interest.m are defined elsewhere in the file.
plotIterest.Cat(interest.m, "Computer Vision")

Graphs

# Per-benchmark interest for the "Graphs" group.
plotIterest.Cat(interest.m, "Graphs")

Natural Language Processing

# Per-benchmark interest for the "Natural Language Processing" group.
plotIterest.Cat(interest.m, "Natural Language Processing")

Playing Games

# Per-benchmark interest for the "Playing Games" group.
plotIterest.Cat(interest.m, "Playing Games")

Miscellaneous

# Per-benchmark interest for the "Miscellaneous" group.
plotIterest.Cat(interest.m, "Miscellaneous")

Medical

# Per-benchmark interest for the "Medical" group.
plotIterest.Cat(interest.m, "Medical")

Methodology

# Per-benchmark interest for the "Methodology" group.
plotIterest.Cat(interest.m, "Methodology")

Speech

# Per-benchmark interest for the "Speech" group.
plotIterest.Cat(interest.m, "Speech")

Reasoning

# Per-benchmark interest for the "Reasoning" group.
plotIterest.Cat(interest.m, "Reasoning")

Time Series

# Per-benchmark interest for the "Time Series" group.
plotIterest.Cat(interest.m, "Time Series")

Computer Code

# Per-benchmark interest for the "Computer Code" group.
plotIterest.Cat(interest.m, "Computer Code")